
Conversation

Contributor

@arsenm arsenm commented Sep 5, 2025

This fixes a regression exposed after 4454152.
This introduces a few small regressions for true16. There are more cases
where the value can propagate through subregister extracts which need
new handling. They're also small enough that perhaps there's a way to avoid
needing to deal with this case in the first place.

Contributor Author

arsenm commented Sep 5, 2025

This stack of pull requests is managed by Graphite. Learn more about stacking.

Member

llvmbot commented Sep 5, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This fixes a regression exposed after 4454152.
This introduces a few small regressions for true16. There are more cases
where the value can propagate through subregister extracts which need
new handling. They're also small enough that perhaps there's a way to avoid
needing to deal with this case in the first place.


Patch is 75.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157032.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (-29)
  • (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+39-18)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+346-210)
  • (modified) llvm/test/CodeGen/AMDGPU/true16-fold.mir (+35-1)
  • (added) llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll (+29)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 5297816ec1f2b..bd9762b0005e1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1129,40 +1129,11 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
   if (!AMDGPU::isSISrcOperand(Desc, UseOpIdx))
     return false;
 
-  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
   if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
     appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold);
     return true;
   }
 
-  // TODO: Verify the following code handles subregisters correctly.
-  // TODO: Handle extract of global reference
-  if (UseOp.getSubReg())
-    return false;
-
-  if (!OpToFold.isReg())
-    return false;
-
-  Register UseReg = OpToFold.getReg();
-  if (!UseReg.isVirtual())
-    return false;
-
-  // Maybe it is just a COPY of an immediate itself.
-
-  // FIXME: Remove this handling. There is already special case folding of
-  // immediate into copy in foldOperand. This is looking for the def of the
-  // value the folding started from in the first place.
-  MachineInstr *Def = MRI->getVRegDef(UseReg);
-  if (Def && TII->isFoldableCopy(*Def)) {
-    MachineOperand &DefOp = Def->getOperand(1);
-    if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
-      FoldableDef FoldableImm(DefOp.getImm(), OpToFold.DefRC,
-                              OpToFold.DefSubReg);
-      appendFoldCandidate(FoldList, UseMI, UseOpIdx, FoldableImm);
-      return true;
-    }
-  }
-
   return false;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 899cc89405440..2fa4883e4945b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -2138,12 +2138,33 @@ define <2 x double> @v_fma_mul_add_32_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], s[4:5], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_fma_mul_add_32_v2f64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
-; GFX1011-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-SDAG-LABEL: v_fma_mul_add_32_v2f64:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
+; GFX10-SDAG-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
+; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_fma_mul_add_32_v2f64:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5]
+; GFX10-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7]
+; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: v_fma_mul_add_32_v2f64:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_fma_f64 v[0:1], 0x40400000, v[0:1], v[4:5]
+; GFX11-SDAG-NEXT:    v_fma_f64 v[2:3], 0x40400000, v[2:3], v[6:7]
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: v_fma_mul_add_32_v2f64:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_fma_f64 v[0:1], v[0:1], 0x40400000, v[4:5]
+; GFX11-GISEL-NEXT:    v_fma_f64 v[2:3], v[2:3], 0x40400000, v[6:7]
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract <2 x double> %x, <double 32.0, double 32.0>
   %fma = fadd contract <2 x double> %mul, %y
   ret <2 x double> %fma
@@ -2501,8 +2522,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) {
 ; GFX10-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, v[0:1]
-; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x40300000, v[2:3]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40300000
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0x40300000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_16_v2f64:
@@ -2515,8 +2536,8 @@ define <2 x double> @v_mul_16_v2f64(<2 x double> %x) {
 ; GFX11-GISEL-LABEL: v_mul_16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, v[0:1]
-; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x40300000, v[2:3]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0x40300000
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0x40300000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x double> %x, <double 16.0, double 16.0>
   ret <2 x double> %mul
@@ -2549,8 +2570,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) {
 ; GFX10-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0300000, v[0:1]
-; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0xc0300000, v[2:3]
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0xc0300000
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0xc0300000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_neg16_v2f64:
@@ -2563,8 +2584,8 @@ define <2 x double> @v_mul_neg16_v2f64(<2 x double> %x) {
 ; GFX11-GISEL-LABEL: v_mul_neg16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0xc0300000, v[0:1]
-; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0xc0300000, v[2:3]
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], 0xc0300000
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], v[2:3], 0xc0300000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul <2 x double> %x, <double -16.0, double -16.0>
   ret <2 x double> %mul
@@ -2597,8 +2618,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) {
 ; GFX10-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, |v[0:1]|
-; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], 0x40300000, |v[2:3]|
+; GFX10-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0x40300000
+; GFX10-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, 0x40300000
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_mul_fabs_16_v2f64:
@@ -2611,8 +2632,8 @@ define <2 x double> @v_mul_fabs_16_v2f64(<2 x double> %x) {
 ; GFX11-GISEL-LABEL: v_mul_fabs_16_v2f64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], 0x40300000, |v[0:1]|
-; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], 0x40300000, |v[2:3]|
+; GFX11-GISEL-NEXT:    v_mul_f64 v[0:1], |v[0:1]|, 0x40300000
+; GFX11-GISEL-NEXT:    v_mul_f64 v[2:3], |v[2:3]|, 0x40300000
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
   %mul = fmul <2 x double> %x.fabs, <double 16.0, double 16.0>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 18c462ffd0ff5..dd2cffd7bd161 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -77,17 +77,53 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
 ; GFX10CHECK-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10CHECK-NEXT:    s_endpgm
 ;
-; GFX11CHECK-LABEL: sgpr_isnan_f16:
-; GFX11CHECK:       ; %bb.0:
-; GFX11CHECK-NEXT:    s_clause 0x1
-; GFX11CHECK-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_cmp_class_f16_e64 s2, s2, 3
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11CHECK-NEXT:    s_endpgm
+; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16:
+; GFX11SELDAG-TRUE16:       ; %bb.0:
+; GFX11SELDAG-TRUE16-NEXT:    s_clause 0x1
+; GFX11SELDAG-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11SELDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11SELDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
+; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, s2, v0.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11SELDAG-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11SELDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16:
+; GFX11SELDAG-FAKE16:       ; %bb.0:
+; GFX11SELDAG-FAKE16-NEXT:    s_clause 0x1
+; GFX11SELDAG-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11SELDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11SELDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11SELDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11SELDAG-FAKE16-NEXT:    v_cmp_class_f16_e64 s2, s2, 3
+; GFX11SELDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11SELDAG-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11SELDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX11GLISEL-TRUE16-LABEL: sgpr_isnan_f16:
+; GFX11GLISEL-TRUE16:       ; %bb.0:
+; GFX11GLISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX11GLISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11GLISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GLISEL-TRUE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 0
+; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, s2, v0.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11GLISEL-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11GLISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11GLISEL-FAKE16-LABEL: sgpr_isnan_f16:
+; GFX11GLISEL-FAKE16:       ; %bb.0:
+; GFX11GLISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX11GLISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11GLISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11GLISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11GLISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11GLISEL-FAKE16-NEXT:    v_cmp_class_f16_e64 s2, s2, 3
+; GFX11GLISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11GLISEL-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11GLISEL-FAKE16-NEXT:    s_endpgm
   %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3)
   %sext = sext i1 %result to i32
   store i32 %sext, ptr addrspace(1) %out, align 4
@@ -212,8 +248,9 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: snan_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 1
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: snan_f16:
@@ -226,8 +263,9 @@ define i1 @snan_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: snan_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 1
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: snan_f16:
@@ -285,8 +323,9 @@ define i1 @qnan_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: qnan_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 2
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 2
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: qnan_f16:
@@ -299,8 +338,9 @@ define i1 @qnan_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: qnan_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 2
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 2
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: qnan_f16:
@@ -358,8 +398,9 @@ define i1 @posinf_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: posinf_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 0x200
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x200
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: posinf_f16:
@@ -372,8 +413,9 @@ define i1 @posinf_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: posinf_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 0x200
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x200
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: posinf_f16:
@@ -429,8 +471,9 @@ define i1 @neginf_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: neginf_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 4
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 4
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: neginf_f16:
@@ -443,8 +486,9 @@ define i1 @neginf_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: neginf_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 4
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 4
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: neginf_f16:
@@ -514,8 +558,9 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: posnormal_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 0x100
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x100
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: posnormal_f16:
@@ -528,8 +573,9 @@ define i1 @posnormal_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: posnormal_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 0x100
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x100
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: posnormal_f16:
@@ -597,8 +643,9 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: negnormal_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 8
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: negnormal_f16:
@@ -611,8 +658,9 @@ define i1 @negnormal_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: negnormal_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 8
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: negnormal_f16:
@@ -673,8 +721,9 @@ define i1 @possubnormal_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: possubnormal_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 0x80
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x80
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: possubnormal_f16:
@@ -687,8 +736,9 @@ define i1 @possubnormal_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: possubnormal_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 0x80
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11GLISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x80
+; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11GLISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11GLISEL-FAKE16-LABEL: possubnormal_f16:
@@ -755,8 +805,9 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX11SELDAG-TRUE16-LABEL: negsubnormal_f16:
 ; GFX11SELDAG-TRUE16:       ; %bb.0:
 ; GFX11SELDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 16
-; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11SELDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 16
+; GFX11SELDAG-TRUE16-NEXT:    v_cmp_class_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11SELDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11SELDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11SELDAG-FAKE16-LABEL: negsubnormal_f16:
@@ -769,8 +820,9 @@ define i1 @negsubnormal_f16(half %x) nounwind {
 ; GFX11GLISEL-TRUE16-LABEL: negsubnormal_f16:
 ; GFX11GLISEL-TRUE16:       ; %bb.0:
 ; GFX11GLISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11GLISEL-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.l, 16
-; GFX11GLISEL-TRUE16-NEXT:    v_cndmask_b32_e...
[truncated]

@arsenm arsenm marked this pull request as ready for review September 5, 2025 07:25
Contributor Author

arsenm commented Sep 10, 2025

ping

Contributor Author

arsenm commented Sep 16, 2025

ping

Contributor Author

arsenm commented Sep 25, 2025

ping

Contributor

jayfoad commented Sep 25, 2025

@broxigarchen @Sisyph are you OK with the true16 regressions?

This fixes a regression exposed after 4454152.
This introduces a few small regressions for true16. There are more cases
where the value can propagate through subregister extracts which need
new handling. They're also small enough that perhaps there's a way to avoid
needing to deal with this case in the first place.
@arsenm arsenm force-pushed the users/arsenm/amdgpu/remove-redundant-recursive-copy-folding-fold-imm-true16-regression branch from 2cf303b to d2d025e on September 25, 2025 11:12
%0:vgpr_32 = COPY $vgpr1
%1:vgpr_32 = COPY $vgpr0
%3:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec
%4:vgpr_32 = COPY %3
Contributor

Yes, this is illegal. I would consider it ill-formed input and not a suitable test. If this IR is generated, we should fix whatever is generating it.

Contributor Author

Yes, I pulled this right out of the failing test, but I was told this is hard to fix and we'll have to deal with it for a while?

Contributor Author

I would leave it as-is, and leave removal for whoever ends up fixing the machine verifier

Contributor

Making the verifier block this is hard to do. Fixing the ISel or whatever code generates %4:vgpr_32 = COPY %3 should not be. Do you know what code that is?
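
For context, a minimal sketch of what such a check could look like (hypothetical helper code, not part of this PR or of the actual MachineVerifier; the hard part referred to above is presumably the existing code that would start failing verification, not the detection itself):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Hypothetical helper: returns true for a full-register COPY between two
// virtual registers whose register classes have different sizes, e.g.
// %4:vgpr_32 = COPY %3:vgpr_16.
static bool isSizeMismatchedCopy(const MachineInstr &MI,
                                 const MachineRegisterInfo &MRI,
                                 const TargetRegisterInfo &TRI) {
  if (!MI.isCopy())
    return false;
  const MachineOperand &Dst = MI.getOperand(0);
  const MachineOperand &Src = MI.getOperand(1);
  // Subregister indices change the effective width, and physical registers
  // carry no register class in MRI, so only look at plain full-register
  // copies between virtual registers here.
  if (Dst.getSubReg() || Src.getSubReg())
    return false;
  if (!Dst.getReg().isVirtual() || !Src.getReg().isVirtual())
    return false;
  const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(Dst.getReg());
  const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(Src.getReg());
  if (!DstRC || !SrcRC)
    return false; // Skip generic virtual registers (register banks / LLTs).
  return TRI.getRegSizeInBits(*DstRC) != TRI.getRegSizeInBits(*SrcRC);
}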

Contributor
@broxigarchen broxigarchen Sep 26, 2025

Sorry, this PR got buried in my mailbox. Hi Matt, can you share the test which generates this MIR?

I've recently merged several patches to address this illegal COPY issue, so this problem might already be gone.

Contributor Author

This is reduced from true16-imm-folded-to-0-regression.ll, which is also added here.

Contributor

I don't think we can land this PR yet. I believe the code provides a correctness function for true16 that can be seen in the alignbit test https://github.com/llvm/llvm-project/pull/157032/files#r2382776092. So the steps would be: 1) wait for #160891 and the follow-up fix to si-fix-sgpr-copies; 2) determine a better place to perform the correctness function this folding code provides; 3) delete the folding code in this patch.

Contributor

Thanks Matt for pointing out the .ll test. I just ran some quick tests; #160891 + #154875 could fix this issue earlier in the pipeline.

Contributor

But #160891 is not really a bug fix, just a workaround.

Contributor

Hi Jay. Yes, I believe there are two issues:

  1. ISel generating a COPY with mismatched vgpr sizes. This should be fixed by [AMDGPU][True16][CodeGen] update isel pattern with vgpr16 for 16 bit types #154875
  2. There is another issue exposed by the patch above: there is one case that si-fix-sgpr-copies fails to handle. [AMDGPU][True16][CodeGen] Avoid setting hi part in copysign #160891 is a workaround for the second issue, and I will create a separate patch to fix it.

The first issue should be a duplicate of the one in this patch.

; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 30
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY [[S_MOV_B32_]]
Contributor

The output before this patch is legal; the output after is not, since we don't have an instruction that can lower vgpr_16 = COPY sgpr_32. The impression I'm getting is that the code that is deleted in this patch is required for true16 correctness, at least for now. There is a separate issue that is causing an incorrect fold into v_bfi_b32; maybe this other patch (#157032) is the fix, or part of it.

Contributor

jayfoad commented Sep 26, 2025

Making the verifier block this is hard to do. Fixing the ISel or whatever code generates %4:vgpr_32 = COPY %3 should not be. Do you know what code that is?

Here's what I see, before twoaddressinstruction:

bb.0.bb:
  liveins: $vgpr0, $vgpr1
  %9:vgpr_32 = COPY killed $vgpr1
  %8:vgpr_32 = COPY killed $vgpr0
  %10:vgpr_32 = REG_SEQUENCE killed %8:vgpr_32, %subreg.lo16, undef %11:sreg_32, %subreg.hi16
  %13:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec
  %14:vgpr_32 = REG_SEQUENCE killed %13:vgpr_16, %subreg.lo16, undef %15:sreg_32, %subreg.hi16
  %18:vgpr_32 = V_BFI_B32_e64 32767, killed %14:vgpr_32, killed %10:vgpr_32, implicit $exec
  %21:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, killed %9:vgpr_32, implicit $exec
  %22:vgpr_16 = V_CNDMASK_B16_t16_e64 0, 15360, 0, killed %18.lo16:vgpr_32, killed %21:sreg_32_xm0_xexec, 0, implicit $exec
  %24:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %22:vgpr_16, 0, 0, 0, 0, implicit $mode, implicit $exec
  $vgpr0 = COPY killed %24:vgpr_32
  SI_RETURN implicit killed $vgpr0

After twoaddressinstruction:

bb.0.bb:
  liveins: $vgpr0, $vgpr1
  %9:vgpr_32 = COPY killed $vgpr1
  %8:vgpr_32 = COPY killed $vgpr0
  undef %10.lo16:vgpr_32 = COPY killed %8:vgpr_32
  %13:vgpr_16 = V_MOV_B16_t16_e64 0, 15360, 0, implicit $exec
  undef %14.lo16:vgpr_32 = COPY killed %13:vgpr_16
  %18:vgpr_32 = V_BFI_B32_e64 32767, killed %14:vgpr_32, killed %10:vgpr_32, implicit $exec
  %21:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, killed %9:vgpr_32, implicit $exec
  %22:vgpr_16 = V_CNDMASK_B16_t16_e64 0, 15360, 0, killed %18.lo16:vgpr_32, killed %21:sreg_32_xm0_xexec, 0, implicit $exec
  %24:vgpr_32 = nofpexcept V_PACK_B32_F16_t16_e64 0, killed %22:vgpr_16, 0, 0, 0, 0, implicit $mode, implicit $exec
  $vgpr0 = COPY killed %24:vgpr_32
  SI_RETURN implicit killed $vgpr0

So twoaddressinstruction introduces the dubious COPY undef %10.lo16:vgpr_32 = COPY killed %8:vgpr_32, but it comes from an equally dubious instruction %10:vgpr_32 = REG_SEQUENCE killed %8:vgpr_32, %subreg.lo16, undef %11:sreg_32, %subreg.hi16 (where %8 is 32-bit but is being used as a 16-bit input to the REG_SEQUENCE) that came straight out of isel.
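
As a sketch only (hypothetical helper code, not from this PR): the property the REG_SEQUENCE above appears to violate is that each source should be exactly as wide as the subregister slot it fills, so a vgpr_32 feeding %subreg.lo16 would be flagged by something like:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Hypothetical check: does any (register, subreg-index) pair of a REG_SEQUENCE
// use a source whose register-class size differs from the width of its
// subregister slot, e.g. a 32-bit vgpr_32 feeding the 16-bit %subreg.lo16?
static bool hasMismatchedRegSequenceSource(const MachineInstr &MI,
                                           const MachineRegisterInfo &MRI,
                                           const TargetRegisterInfo &TRI) {
  if (!MI.isRegSequence())
    return false;
  // Operand 0 is the def; the remaining operands come in
  // (register, subreg-index) pairs.
  for (unsigned I = 1, E = MI.getNumOperands(); I + 1 < E; I += 2) {
    const MachineOperand &SrcOp = MI.getOperand(I);
    unsigned SubIdx = MI.getOperand(I + 1).getImm();
    // Skip physical registers and sources that are themselves subregister
    // uses, to keep the sketch simple.
    if (!SrcOp.getReg().isVirtual() || SrcOp.getSubReg())
      continue;
    const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcOp.getReg());
    if (!SrcRC)
      continue;
    if (TRI.getRegSizeInBits(*SrcRC) != TRI.getSubRegIdxSize(SubIdx))
      return true;
  }
  return false;
}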
